# --- Environment setup for the Zillow logerror EDA notebook ---
import calendar
import time
import numpy as np
import pandas as pd
import seaborn as sns
from scipy import stats
import missingno as msno
from datetime import datetime
import matplotlib
import matplotlib.pyplot as plt
from scipy.stats import kendalltau
import warnings
#matplotlib.style.use('ggplot')
# Silence SettingWithCopyWarning and widen the column display for wide frames.
pd.options.mode.chained_assignment = None
pd.options.display.max_columns = 999
warnings.filterwarnings("ignore")
%matplotlib inline
color = sns.color_palette("hls", 8)
sns.set_style("whitegrid")
# Plotly in offline (no-account) mode for interactive charts inside the notebook.
from plotly import __version__
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import plotly.offline as offline
import plotly.graph_objs as go
# import cufflinks and offline mode
import cufflinks as cf
cf.go_offline()
print('plotly version:', __version__)
init_notebook_mode(connected=True)
# Load the 2016 transactions (parsing the date column) and the property features.
train_df = pd.read_csv("train_2016_v2.csv", parse_dates=["transactiondate"])
prop_df = pd.read_csv("properties_2016.csv")
print ("Shape Of Train: ", train_df.shape)
print ("Shape Of Properties: ", prop_df.shape)
train_df.head()
prop_df.head()
The data set lists real estate properties in three California counties (Los Angeles, Orange, and Ventura) for 2016.
There are 90,275 rows in the train file and 2,985,217 rows in the properties file. Merge the two files and then carry out the analysis.
# Left-join the property features onto each transaction by parcelid.
train_df = pd.merge(train_df,prop_df,on="parcelid",how="left")
train_df.head()
pd.options.display.max_rows = 65
# NOTE: the column that actually holds the *column names* is deliberately
# labelled "Count" so that the groupby('Column Type').count() below reports
# its per-dtype counts under the heading "Count".
dtype_df = train_df.dtypes.reset_index()
dtype_df.columns = ["Count", "Column Type"]
dtype_df
dtype_df.groupby("Column Type").aggregate('count').reset_index()
# Bar chart: number of columns of each dtype in the merged frame.
dataTypeDf = pd.DataFrame(train_df.dtypes.value_counts()).reset_index().rename(columns={"index":"variableType",0:"count"})
# Create a trace
trace = go.Bar(
    x = dataTypeDf["variableType"].astype(str),
    y = dataTypeDf["count"],
)
data = [trace]
# Edit the layout
layout = dict(title = "Variables Count Across Datatype",
              xaxis = dict(title = "VariableType"),
              yaxis = dict(title = "Count"),
              font = dict(size=15),
              autosize = False,
              width = 800,
              height = 500,
              )
fig = dict(data=data, layout=layout)
iplot(fig)
# Per-column missing-value counts and ratios for the merged training frame,
# then show the columns that are missing more than 99.5% of the time.
n_rows = train_df.shape[0]
missing_df = train_df.isnull().sum(axis=0).reset_index()
missing_df.columns = ['column_name', 'missing_count']
missing_df['missing_ratio'] = missing_df['missing_count'] / n_rows
missing_df.loc[missing_df['missing_ratio'] > 0.995]
Four columns have missing values more than 99.5% of the time!
Logerror:
Target variable for this competition is "logerror" field.
# Plot logerror sorted ascending against its rank — makes the heavy tails
# (outliers at both ends) visually obvious.
# Create a trace
trace = go.Scatter(
    x = range(train_df.shape[0]),
    y = np.sort(train_df.logerror.values),
    #mode= 'markers',
    #marker= dict(size= 4, line= dict(width=1), opacity= 0.3, )
)
data = [trace]
# Edit the layout
layout = dict(title = 'Logerror distribution',
              xaxis = dict(title = 'index'),
              yaxis = dict(title = 'logerror'),
              font = dict(size=16),
              autosize = False,
              width = 600,
              height = 500,
              )
fig = dict(data=data, layout=layout)
iplot(fig)
Outliers at both the ends!
Remove the outliers and then do a histogram plot on the same.
# Winsorize logerror at the 1st/99th percentiles, then plot its distribution.
ulimit = np.percentile(train_df.logerror.values, 99)
llimit = np.percentile(train_df.logerror.values, 1)
# FIX: `.ix` was removed in pandas 1.0; Series.clip replaces both of the
# old `.ix`-based assignments in one call with identical results.
train_df['logerror'] = train_df['logerror'].clip(lower=llimit, upper=ulimit)
import plotly.figure_factory as ff
hist_data = [train_df.logerror.values]
group_labels = ['logerror']
# Create distplot with custom bin_size
fig = ff.create_distplot(hist_data, group_labels, bin_size=.01)
# Plot!
iplot(fig)
data = [go.Histogram(x=train_df.logerror.values)]
iplot(data)
Transaction Date:
Date field. Check number of transactions in each month.
# Count transactions per calendar month and plot as a bar chart.
train_df['transaction_month'] = train_df['transactiondate'].dt.month
cnt_srs = train_df['transaction_month'].value_counts()
data = [
    go.Bar(
        x=cnt_srs.index,
        y=cnt_srs.values,
    )]
# Edit the layout
layout = dict(title = 'Transaction distribution',
              xaxis = dict(title = 'Month of transaction'),
              yaxis = dict(title = 'Number of Occurrences'),
              font = dict(size=16),
              )
fig = dict(data=data, layout=layout)
iplot(fig)
The train data contains all transactions before October 15, 2016, but only a sample of the transactions after that date —
hence the shorter bars in the last three months.
Parcel Id:
(train_df['parcelid'].value_counts().reset_index())['parcelid'].value_counts()
Most parcel ids appear only once in the dataset.
# missingno completeness bar chart over every column that has at least one NaN.
missingValueColumns = train_df.columns[train_df.isnull().any()]
msno.bar(train_df[missingValueColumns],\
         figsize=(20,8),color='blue',fontsize=12,labels=True)
# Horizontal bar chart of missing-value counts per column in the raw
# properties file (before the merge).
missing_df = prop_df.isnull().sum(axis=0).reset_index()
missing_df.columns = ['column_name', 'missing_count']
missing_df = missing_df.loc[missing_df['missing_count']>0]
missing_df = missing_df.sort_values(by='missing_count')
data = [go.Bar(
    x = missing_df.missing_count.values,
    y = missing_df.column_name,
    orientation = 'h',
)]
# Edit the layout
layout = dict(title = "Number of missing values in each column",
              xaxis = dict(title = "Count of missing values"),
              yaxis = dict(tickangle=35,
                           tickfont=dict(size=9)),
              font = dict(size=8),
              autosize = False,
              width = 900,
              height = 990,
              )
fig = dict(data=data, layout=layout)
iplot(fig)
Univariate Analysis:
Since there are so many variables, investigate 'float' variables alone and then get the correlation with the target variable.
# Let us just impute the missing values with mean values to compute correlation coefficients #
mean_values = train_df.mean(axis=0)
# BUG FIX: `train_df_new = train_df.fillna(mean_values, inplace=True)` bound
# train_df_new to None (fillna with inplace=True returns None), so every later
# use of train_df_new raised AttributeError. Assign the filled frame and keep
# train_df pointing at the imputed data, matching what the in-place call did.
train_df = train_df.fillna(mean_values)
train_df_new = train_df
# Now let us look at the correlation coefficient of each of these variables #
x_cols = [col for col in train_df_new.columns if col not in ['logerror'] if train_df_new[col].dtype=='float64']
labels = []
values = []
# Pearson correlation of each float column against the target.
for col in x_cols:
    labels.append(col)
    values.append(np.corrcoef(train_df_new[col].values, train_df_new.logerror.values)[0,1])
corr_df = pd.DataFrame({'col_labels':labels, 'corr_values':values})
corr_df = corr_df.sort_values(by='corr_values')
# Horizontal bar chart of each float column's correlation with logerror.
data = [go.Bar(
    x = np.array(corr_df.corr_values.values),
    y = corr_df['col_labels'],
    orientation = 'h',
)]
# Edit the layout
layout = dict(title = "Correlation coefficient of the variables",
              xaxis = dict(title = "Correlation coefficient"),
              yaxis = dict(tickangle=35,
                           tickfont=dict(size=9)),
              font = dict(size=12),
              autosize = False,
              width = 900,
              height = 990,
              )
fig = dict(data=data, layout=layout)
iplot(fig)
The correlation of the target variable with the given set of variables is low overall.
A few variables at the top of this graph have no correlation values; they may contain only a single unique value, which makes the correlation undefined.
# Columns whose correlation came out undefined: show how many distinct values
# each one holds (a single unique value yields zero variance, hence NaN corr).
corr_zero_cols = ['assessmentyear', 'storytypeid', 'pooltypeid2', 'pooltypeid7', 'pooltypeid10', 'poolcnt', 'decktypeid', 'buildingclasstypeid']
for col in corr_zero_cols:
    distinct_values = train_df_new[col].unique()
    print(col, len(distinct_values))
Check out variables with high correlation values.
# Keep only variables whose correlation with logerror is meaningfully non-zero,
# then compute their pairwise Spearman correlation matrix.
# FIX: `.ix` was removed in pandas 1.0 — boolean `.loc` indexing is the
# drop-in replacement with identical behavior.
corr_df_sel = corr_df.loc[(corr_df['corr_values']>0.02) | (corr_df['corr_values'] < -0.01)]
corr_df_sel
cols_to_use = corr_df_sel.col_labels.tolist()
temp_df = train_df[cols_to_use]
corrmat = temp_df.corr(method='spearman')
# Spearman correlation heatmap of the selected variables
# (diverging colorscale: red = low, blue = high).
trace = go.Heatmap(z=np.array(corrmat),
                   x = cols_to_use,
                   y = cols_to_use,
                   #colorscale= 'Jet')
                   colorscale=[[0.0000000000000000, 'rgb(165,0,38)'], [0.1111111111111111, 'rgb(215,48,39)'],
                               [0.2222222222222222, 'rgb(244,109,67)'], [0.3333333333333333, 'rgb(253,174,97)'],
                               [0.4444444444444444, 'rgb(254,224,144)'], [0.5555555555555556, 'rgb(224,243,248)'],
                               [0.6666666666666666, 'rgb(171,217,233)'], [0.7777777777777778, 'rgb(116,173,209)'],
                               [0.8888888888888888, 'rgb(69,117,180)'], [1.0000000000000000, 'rgb(49,54,149)']],)
#colorscale=[[1.0000000000000000, 'rgb(165,0,38)'], [0.8888888888888888, 'rgb(215,48,39)'],
#            [0.7777777777777778, 'rgb(244,109,67)'], [0.6666666666666666, 'rgb(253,174,97)'],
#            [0.5555555555555556, 'rgb(254,224,144)'], [0.4444444444444444, 'rgb(224,243,248)'],
#            [0.3333333333333333, 'rgb(171,217,233)'], [0.2222222222222222, 'rgb(116,173,209)'],
#            [0.1111111111111111, 'rgb(69,117,180)'], [0.0000000000000000, 'rgb(49,54,149)']],)
data=[trace]
# Edit the layout
layout = dict(title = "Important variables correlation map",
              font = dict(size=12),
              autosize = False,
              width = 500,
              height = 500,
              )
fig = dict(data=data, layout=layout)
iplot(fig)
Let us now look at each of them. Investigate individually.
Bathroom Count:
# Histogram of bathroom counts across the training set.
data = [
    go.Histogram(
        x=train_df['bathroomcnt'],
        histnorm='count',
        #marker=dict(colorscale='Jet',),
        #opacity=0.75
    )]
# Edit the layout
layout = dict(title = 'Frequency of Bathroom count',
              xaxis = dict(title = 'Bathroom'),
              yaxis = dict(title = 'Count'),
              font = dict(size=16),
              autosize = False,
              width = 800,
              height = 500,
              bargap=0.2,
              )
fig = dict(data=data, layout=layout)
iplot(fig)
# Box plot of logerror for each distinct bathroom count, one colour per box.
# FIX: the original built the frequency table with `src.count(i)` inside a
# comprehension — O(n*k) over ~90k rows — replaced with one value_counts pass.
# NOTE(review): value_counts drops NaN whereas the old set/count approach kept
# them; bathroomcnt was mean-imputed earlier so no NaN should remain — confirm.
result_dict = train_df['bathroomcnt'].value_counts().to_dict()
f = train_df.sort_values(by=['bathroomcnt'], ascending=[True])
N = len(result_dict) # Number of boxes
nbathroom = sorted(result_dict.keys())
# generate an array of rainbow colors by fixing the saturation and lightness of the HSL representation of colour
# and marching around the hue.
# Plotly accepts any CSS color format, see e.g. http://www.w3schools.com/cssref/css_colors_legal.asp.
c = ['hsl('+str(h)+',50%'+',50%)' for h in np.linspace(0, 360, N)]
# Each box is represented by a dict that contains the data, the type, and the colour.
# Use list comprehension to describe N boxes, each with a different colour and with different randomly generated data:
data = [{
    'x': nbathroom[i],
    'y': f['logerror'][f['bathroomcnt']==nbathroom[i]],
    'name':nbathroom[i],
    'type':'box',
    'marker':{'color': c[i]}
} for i in range(N)]
# Edit the layout
layout = dict(title = "Logerror vs Bathroom count",
              xaxis = dict(title = "Bathroom count"),
              yaxis = dict(title = "Logerror",
                           tickangle=0,
                           tickfont=dict(size=9)),
              font = dict(size=12),
              autosize = False,
              width = 900,
              height = 700,
              )
fig = dict(data=data, layout=layout)
iplot(fig)
# Histogram of bedroom counts across the training set.
data = [
    go.Histogram(
        x=train_df['bedroomcnt'],
        histnorm='count',
        #marker=dict(colorscale='Jet',),
        #opacity=0.75
    )]
# Edit the layout
layout = dict(title = 'Frequency of Bedroom count',
              xaxis = dict(title = 'Bedroom count'),
              yaxis = dict(title = 'Frequency'),
              font = dict(size=16),
              autosize = False,
              width = 800,
              height = 500,
              bargap=0.2,
              )
fig = dict(data=data, layout=layout)
iplot(fig)
3.03 is the mean value with which we replaced the Null values.
# Cap bedroom counts at 7 and show the logerror distribution per count.
# FIX: `.ix` was removed in pandas 1.0 — use .loc with a (row mask, column)
# pair, which also avoids chained-assignment semantics.
train_df.loc[train_df['bedroomcnt']>7, 'bedroomcnt'] = 7
plt.figure(figsize=(12,8))
sns.violinplot(x='bedroomcnt', y='logerror', data=train_df)
plt.xlabel('Bedroom count', fontsize=12)
plt.ylabel('Log Error', fontsize=12)
plt.show()
# Same violin plot rendered with plotly's figure factory.
# FIX 1: `.ix` was removed in pandas 1.0 — use .loc with (row mask, column).
train_df.loc[train_df['bedroomcnt']>7, 'bedroomcnt'] = 7
# NOTE(review): ff.create_violin was removed in plotly 4 — requires plotly<4.
fig = ff.create_violin(train_df, data_header='logerror', group_header='bedroomcnt')
# Edit the layout
layout = dict(title='Log Error vs Bedroom count',
              xaxis = dict(title = 'Bedroom count'),
              yaxis = dict(title = 'Log Error'),
              font = dict(size=16),
              autosize = False,
              width = 800,
              height = 500,
              bargap=0.2,
              )
# FIX 2: `iplot(fig, layout)` passed layout as iplot's second positional
# argument (show_link), so the title/axis labels were silently ignored.
# Merge the layout into the figure before plotting instead.
fig['layout'].update(layout)
iplot(fig)
# Winsorize the tax amount at the 1st/99th percentiles, then plot it against
# logerror as a joint scatter/density plot.
col = "taxamount"
ulimit = np.percentile(train_df[col].values, 99)
llimit = np.percentile(train_df[col].values, 1)
# FIX: `.ix` was removed in pandas 1.0; Series.clip replaces both of the
# old `.ix`-based assignments with identical results.
train_df[col] = train_df[col].clip(lower=llimit, upper=ulimit)
plt.figure(figsize=(12,12))
sns.jointplot(x=train_df['taxamount'].values, y=train_df['logerror'].values, size=10, color='g')
plt.ylabel('Log Error', fontsize=12)
plt.xlabel('Tax Amount', fontsize=12)
plt.title("Tax Amount Vs Log error", fontsize=15)
plt.show()
# NOTE(review): the 'ggplot' package is unmaintained and breaks on modern
# pandas — consider 'plotnine' as a near-drop-in replacement.
from ggplot import *
# Year built vs logerror with a smoothed trend line.
ggplot(aes(x='yearbuilt', y='logerror'), data=train_df) + \
    geom_point(color='steelblue', size=1) + \
    stat_smooth()
# Geographic scatter (latitude/longitude) coloured by logerror.
ggplot(aes(x='latitude', y='longitude', color='logerror'), data=train_df) + \
    geom_point() + \
    scale_color_gradient(low = 'red', high = 'blue')
# Finished living area vs tax amount, coloured by logerror.
ggplot(aes(x='finishedsquarefeet12', y='taxamount', color='logerror'), data=train_df) + \
    geom_point(alpha=0.7) + \
    scale_color_gradient(low = 'pink', high = 'blue')
# Fit an ExtraTrees regressor on the numeric features and plot the top-20
# feature importances (error bars = std of importances across the trees).
import sklearn
from sklearn import ensemble, linear_model
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, make_scorer
from sklearn.model_selection import GridSearchCV
from sklearn.dummy import DummyClassifier
train_y = train_df['logerror'].values
# Drop identifiers, the target, the date fields and the object-typed columns
# (trees here can only consume numeric input).
cat_cols = ["hashottuborspa", "propertycountylandusecode", "propertyzoningdesc", "fireplaceflag", "taxdelinquencyflag"]
train_df = train_df.drop(['parcelid', 'logerror', 'transactiondate', 'transaction_month']+cat_cols, axis=1)
feat_names = train_df.columns.values
model = ensemble.ExtraTreesRegressor(n_estimators=25, max_depth=30, max_features=0.3, n_jobs=-1, random_state=0)
model.fit(train_df, train_y)
## plot the importances ##
importances = model.feature_importances_
std = np.std([tree.feature_importances_ for tree in model.estimators_], axis=0)
indices = np.argsort(importances)[::-1][:20]
plt.figure(figsize=(12,12))
plt.title("Feature importances")
plt.bar(range(len(indices)), importances[indices], color="r", yerr=std[indices], align="center")
plt.xticks(range(len(indices)), feat_names[indices], rotation='vertical')
plt.xlim([-1, len(indices)])
plt.show()
#SKlearn: Linear Regression & ExtraTreesRegressor
# Reload the raw data, build an 80k/remainder train/validation split, and
# compare several sklearn regressors by validation-set MAE.
import numpy as np
import pandas as pd
import gc
from sklearn.metrics import mean_absolute_error, make_scorer
from sklearn.model_selection import GridSearchCV
start = time.time()
print('Loading data ...')
train = pd.read_csv('train_2016_v2.csv')
prop = pd.read_csv('properties_2016.csv')
sample = pd.read_csv('sample_submission.csv')
print('Binding to float32')
# Downcast float64 property columns to float32 to halve memory usage.
for c, dtype in zip(prop.columns, prop.dtypes):
    if dtype == np.float64:
        prop[c] = prop[c].astype(np.float32)
print('Creating training set ...')
df_train = train.merge(prop, how='left', on='parcelid')
x_train = df_train.drop(['parcelid',
                         'logerror',
                         'transactiondate',
                         'propertyzoningdesc',
                         'propertycountylandusecode'], axis=1)
y_train = df_train['logerror'].values
print(x_train.shape, y_train.shape)
train_columns = x_train.columns
# Crude encoding: object columns become booleans (True where the cell == True).
for c in x_train.dtypes[x_train.dtypes == object].index.values:
    x_train[c] = (x_train[c] == True)
print('deleting df_train ...')
del df_train; gc.collect()
# First 80,000 rows train, the rest validate (RHS evaluated before rebinding).
split = 80000
x_train, y_train, x_valid, y_valid = x_train[:split], y_train[:split], x_train[split:], y_train[split:]
# BUG FIX: `x_train = x_train.fillna(mean_values, inplace=True)` bound x_train
# (and likewise x_valid) to None — fillna with inplace=True returns None — and
# crashed every model.fit() below. Assign the filled frame without inplace.
mean_values = x_train.mean(axis=0)
x_train = x_train.fillna(mean_values)
mean_values = x_valid.mean(axis=0)
x_valid = x_valid.fillna(mean_values)
model = linear_model.LinearRegression()
model.fit(x_train, y_train)
print('Predicting on validation set ...')
pred = model.predict(x_valid)
print('LinearRegression Model validation set MAE: ' + str(mean_absolute_error(y_valid, pred)))
print()
model = linear_model.RANSACRegressor()
model.fit(x_train, y_train)
print('Predicting on validation set ...')
pred = model.predict(x_valid)
print('RANSACRegressor Model validation set MAE: ' + str(mean_absolute_error(y_valid, pred)))
print()
model = linear_model.TheilSenRegressor(random_state=42)
model.fit(x_train, y_train)
print('Predicting on validation set ...')
pred = model.predict(x_valid)
print('TheilSenRegressor Model validation set MAE: ' + str(mean_absolute_error(y_valid, pred)))
print()
model = linear_model.BayesianRidge(compute_score=True)
model.fit(x_train, y_train)
print('Predicting on validation set ...')
pred = model.predict(x_valid)
print('BayesianRidge Model validation set MAE: ' + str(mean_absolute_error(y_valid, pred)))
print()
model = ensemble.ExtraTreesRegressor(n_estimators=50, max_depth=30, max_features=0.3, random_state=777)
model.fit(x_train, y_train)
print('Predicting on validation set ...')
pred = model.predict(x_valid)
print('ExtraTreesRegressor Model validation set MAE: ' + str(mean_absolute_error(y_valid, pred)))
# Hold out 11% of the merged frame and score two regressors by MAE
# on the held-out rows.
split_kwargs = dict(test_size=0.11, random_state=2017)

xtrain, xtest, ytrain, ytest = train_test_split(train_df, train_y, **split_kwargs)
model = linear_model.LinearRegression().fit(xtrain, ytrain)
pred = model.predict(xtest)
mean_absolute_error(ytest, pred)

# Re-split with the same seed so both models see identical folds.
xtrain, xtest, ytrain, ytest = train_test_split(train_df, train_y, **split_kwargs)
model = ensemble.ExtraTreesRegressor(n_estimators=100, max_depth=30,
                                     max_features=0.3, n_jobs=-1,
                                     random_state=777).fit(xtrain, ytrain)
pred = model.predict(xtest)
mean_absolute_error(ytest, pred)
# Grid-search ExtraTreesRegressor hyper-parameters on an 80k/remainder split,
# compare against constant (mean / median) baselines, and write a submission.
import numpy as np
import pandas as pd
import gc
from sklearn.metrics import mean_absolute_error, make_scorer
from sklearn.model_selection import GridSearchCV
from sklearn.dummy import DummyRegressor
start = time.time()
print('Loading data ...')
train = pd.read_csv('train_2016_v2.csv')
prop = pd.read_csv('properties_2016.csv')
sample = pd.read_csv('sample_submission.csv')
print('Binding to float32')
# Downcast float64 property columns to float32 to halve memory usage.
for c, dtype in zip(prop.columns, prop.dtypes):
    if dtype == np.float64:
        prop[c] = prop[c].astype(np.float32)
print('Creating training set ...')
df_train = train.merge(prop, how='left', on='parcelid')
x_train = df_train.drop(['parcelid',
                         'logerror',
                         'transactiondate',
                         'propertyzoningdesc',
                         'propertycountylandusecode'], axis=1)
y_train = df_train['logerror'].values
print(x_train.shape, y_train.shape)
train_columns = x_train.columns
# Crude encoding: object columns become booleans (True where the cell == True).
for c in x_train.dtypes[x_train.dtypes == object].index.values:
    x_train[c] = (x_train[c] == True)
# Baseline constants for the dummy regressors.
c1 = df_train['logerror'].mean()
c2 = df_train['logerror'].median()
print('Logerror mean: ' + str(c1))
print('Logerror median: ' + str(c2))
print('deleting df_train ...')
del df_train; gc.collect()
split = 80000
x_train, y_train, x_valid, y_valid = x_train[:split], y_train[:split], x_train[split:], y_train[split:]
# BUG FIX: fillna(..., inplace=True) returns None; assign without inplace.
mean_values = x_train.mean(axis=0)
x_train = x_train.fillna(mean_values)
#model = ensemble.ExtraTreesRegressor(n_estimators=100, max_depth=30, max_features=0.3, n_jobs=-1, random_state=777)
parameters = {'n_estimators':[50, 75, 100], 'max_depth':[10, 30, 50], 'max_features':[0.1, 0.3, 0.5]}
extratree = ensemble.ExtraTreesRegressor() #criterion='mae'
# BUG FIX: MAE must be minimised. make_scorer defaults to
# greater_is_better=True, which made GridSearchCV select the *worst*
# parameter combination.
model = GridSearchCV(extratree, parameters, verbose=2,
                     scoring=make_scorer(mean_absolute_error, greater_is_better=False))
model.fit(x_train, y_train)
#model.cv_results_
model.best_params_
# BUG FIX: fillna(..., inplace=True) returns None; assign without inplace.
mean_values = x_valid.mean(axis=0)
x_valid = x_valid.fillna(mean_values)
# BUG FIX: this is a regression task. DummyClassifier(constant=...) without
# strategy='constant' silently ignores the constant, and a classifier cannot
# fit a continuous target — DummyRegressor is the correct baseline.
dummy1 = DummyRegressor(strategy='constant', constant=c1)
dummy2 = DummyRegressor(strategy='constant', constant=c2)
dummy1.fit(x_train, y_train)
dummy2.fit(x_train, y_train)
print('Predicting on validation set ...')
pred = model.predict(x_valid)
pred1 = dummy1.predict(x_valid)
pred2 = dummy2.predict(x_valid)
print('Model validation set MAE: ' + str(mean_absolute_error(y_valid, pred)))
print('Mean Dummy regressor validation set MAE: ' + str(mean_absolute_error(y_valid, pred1)))
print('Median Dummy regressor validation set MAE: '+ str(mean_absolute_error(y_valid, pred2)))
print('Building test set ...')
sample['parcelid'] = sample['ParcelId']
df_test = sample.merge(prop, on='parcelid', how='left')
print('x_test ...')
x_test = df_test[train_columns]
for c in x_test.dtypes[x_test.dtypes == object].index.values:
    x_test[c] = (x_test[c] == True)
print('Fill NA ...')
# BUG FIX: fillna(..., inplace=True) returns None; assign without inplace.
mean_values = x_test.mean(axis=0)
x_test = x_test.fillna(mean_values)
print('Predicting on test ...')
p_test = model.predict(x_test)
#p_test = 0.97*p_test + 0.03*0.011
print('deleting x_test ...')
del x_test; gc.collect()
sub = pd.read_csv('sample_submission.csv')
for c in sub.columns[sub.columns != 'ParcelId']:
    sub[c] = p_test
end = time.time()
print('time taken: ' + str(end - start))
print('Writing csv ...')
sub.to_csv('ExtraTrees_starter.csv', index=False, float_format='%.4f')
# Train a shallow XGBoost model on the full frame and plot feature importances.
import xgboost as xgb
print(xgb.__version__)
xgb_params = {
    'eta': 0.05,
    'max_depth': 8,
    'subsample': 0.7,
    'colsample_bytree': 0.7,
    'objective': 'reg:linear',
    'silent': 1,
    'seed' : 0
}
dtrain = xgb.DMatrix(train_df, train_y, feature_names=train_df.columns.values)
# silent=0 overrides the params dict so training progress is printed.
model = xgb.train(dict(xgb_params, silent=0), dtrain, num_boost_round=50)
# plot the important features #
fig, ax = plt.subplots(figsize=(12,18))
xgb.plot_importance(model, max_num_features=50, height=0.8, ax=ax)
plt.show()
# XGBoost starter: train with early stopping on an 80k/remainder split,
# predict on the test set and write a submission file.
import numpy as np
import pandas as pd
import xgboost as xgb
import gc
print('Loading data ...')
train = pd.read_csv('train_2016_v2.csv')
prop = pd.read_csv('properties_2016.csv')
sample = pd.read_csv('sample_submission.csv')
print('Binding to float32')
# Downcast float64 property columns to float32 to halve memory usage.
for c, dtype in zip(prop.columns, prop.dtypes):
    if dtype == np.float64:
        prop[c] = prop[c].astype(np.float32)
print('Creating training set ...')
df_train = train.merge(prop, how='left', on='parcelid')
x_train = df_train.drop(['parcelid',
                         'logerror',
                         'transactiondate',
                         'propertyzoningdesc',
                         'propertycountylandusecode'], axis=1)
y_train = df_train['logerror'].values
print(x_train.shape, y_train.shape)
train_columns = x_train.columns
# Object columns -> booleans (True where the cell literally equals True).
for c in x_train.dtypes[x_train.dtypes == object].index.values:
    x_train[c] = (x_train[c] == True)
del df_train; gc.collect()
# First 80,000 rows train, the remainder validates.
split = 80000
x_train, y_train, x_valid, y_valid = x_train[:split], y_train[:split], x_train[split:], y_train[split:]
print('Building DMatrix...')
d_train = xgb.DMatrix(x_train, label=y_train)
d_valid = xgb.DMatrix(x_valid, label=y_valid)
del x_train; gc.collect() #, x_valid
print('Training ...')
params = {
    'eta' : 0.02,
    'objective' : 'reg:linear',
    'eval_metric' : 'mae',
    'max_depth' : 4,
    'silent' : 1,
}
# Up to 10k boosting rounds, stopping when valid MAE stalls for 100 rounds.
watchlist = [(d_train, 'train'), (d_valid, 'valid')]
clf = xgb.train(params,
                d_train,
                10000,
                watchlist,
                early_stopping_rounds=100,
                verbose_eval=10)
del d_train #, d_valid
print('Building test set ...')
sample['parcelid'] = sample['ParcelId']
df_test = sample.merge(prop, on='parcelid', how='left')
del prop; gc.collect()
x_test = df_test[train_columns]
for c in x_test.dtypes[x_test.dtypes == object].index.values:
    x_test[c] = (x_test[c] == True)
del df_test, sample; gc.collect()
d_test = xgb.DMatrix(x_test)
del x_test; gc.collect()
print('Predicting on validation set ...')
pred = clf.predict(d_valid)
# NOTE(review): dummy1/dummy2 are defined in the earlier GridSearch cell —
# this cell only works if that one ran first; confirm before running standalone.
pred1 = dummy1.predict(x_valid)
pred2 = dummy2.predict(x_valid)
print('Validation set MAE: '+ str(mean_absolute_error(y_valid, pred)))
print('Mean Dummy regressor validation set MAE: ' + str(mean_absolute_error(y_valid, pred1)))
print('Median Dummy regresor validation set MAE: '+ str(mean_absolute_error(y_valid, pred2)))
print('Predicting on test ...')
p_test = clf.predict(d_test)
# Shrink predictions toward 0.011 (presumably the historical mean logerror
# — TODO confirm).
p_test = 0.97*p_test + 0.03*0.011
del d_test; gc.collect()
sub = pd.read_csv('sample_submission.csv')
for c in sub.columns[sub.columns != 'ParcelId']:
    sub[c] = p_test
print('Writing csv ...')
sub.to_csv('xgb_starter.csv', index=False, float_format='%.4f')
# Any results you write to the current directory are saved as output.
# Any results you write to the current directory are saved as output.
import numpy as np
import pandas as pd
import lightgbm as lgb
import gc
print('Loading data ...')
train = pd.read_csv('train_2016_v2.csv')
prop = pd.read_csv('properties_2016.csv')
for c, dtype in zip(prop.columns, prop.dtypes):
if dtype == np.float64:
prop[c] = prop[c].astype(np.float32)
df_train = train.merge(prop, how='left', on='parcelid')
x_train = df_train.drop(['parcelid',
'logerror',
'transactiondate',
'propertyzoningdesc',
'propertycountylandusecode'], axis=1)
y_train = df_train['logerror'].values
print(x_train.shape, y_train.shape)
train_columns = x_train.columns
for c in x_train.dtypes[x_train.dtypes == object].index.values:
x_train[c] = (x_train[c] == True)
del df_train; gc.collect()
split = 80000
x_train, y_train, x_valid, y_valid = x_train[:split], y_train[:split], x_train[split:], y_train[split:]
x_train = x_train.values.astype(np.float32, copy=False)
x_valid = x_valid.values.astype(np.float32, copy=False)
d_train = lgb.Dataset(x_train, label=y_train)
d_valid = lgb.Dataset(x_valid, label=y_valid)
params = {
'max_bin' : 20,
'learning_rate' : 0.0021, # shrinkage_rate
'boosting_type' : 'gbdt',
'objective' : 'regression',
'metric' : 'l1', # or 'mae'
'sub_feature' : 0.5, # feature_fraction
'bagging_fraction': 0.85, # sub_row
'bagging_freq' : 40,
'num_leaves' : 512, # num_leaf
'min_data' : 500, # min_data_in_leaf
'min_hessian' : 0.05, # min_sum_hessian_in_leaf
}
watchlist = [d_valid]
clf = lgb.train(params, d_train, 500, watchlist)
del d_train, d_valid; gc.collect()
del x_train; gc.collect() #, x_valid
print("Prepare for the prediction ...")
sample = pd.read_csv('sample_submission.csv')
sample['parcelid'] = sample['ParcelId']
df_test = sample.merge(prop, on='parcelid', how='left')
del sample, prop; gc.collect()
x_test = df_test[train_columns]
del df_test; gc.collect()
for c in x_test.dtypes[x_test.dtypes == object].index.values:
x_test[c] = (x_test[c] == True)
x_test = x_test.values.astype(np.float32, copy=False)
print('Predicting on validation set ...')
pred = clf.predict(x_valid)
print('Validation set MAE: '+ str(mean_absolute_error(y_valid, pred)))
print("Start prediction ...")
# num_threads > 1 will predict very slow in kernel
clf.reset_parameter({"num_threads":1})
p_test = clf.predict(x_test)
p_test = 0.97*p_test + 0.03*0.011
del x_test; gc.collect()
print("Start write result ...")
sub = pd.read_csv('sample_submission.csv')
for c in sub.columns[sub.columns != 'ParcelId']:
sub[c] = p_test
sub.to_csv('lgb_starter_1.csv', index=False, float_format='%.4f')
# LightGBM variant restricted to the ten most important features
# (same split, parameters and submission flow as the full-feature run).
import numpy as np
import pandas as pd
import lightgbm as lgb
import gc
print('Loading data ...')
train = pd.read_csv('train_2016_v2.csv')
prop = pd.read_csv('properties_2016.csv')
# Downcast float64 property columns to float32 to halve memory usage.
for c, dtype in zip(prop.columns, prop.dtypes):
    if dtype == np.float64:
        prop[c] = prop[c].astype(np.float32)
df_train = train.merge(prop, how='left', on='parcelid')
x_train = df_train.drop(['parcelid',
                         'logerror',
                         'transactiondate',
                         'propertyzoningdesc',
                         'propertycountylandusecode'], axis=1)
# Keep only the hand-picked top-importance features.
x_train = x_train[['calculatedfinishedsquarefeet','structuretaxvaluedollarcnt','latitude','longitude','taxvaluedollarcnt',\
                   'yearbuilt','taxamount','lotsizesquarefeet','landtaxvaluedollarcnt','bathroomcnt']]
y_train = df_train['logerror'].values
print(x_train.shape, y_train.shape)
train_columns = x_train.columns
# Object columns -> booleans (no-op here: all selected features are numeric).
for c in x_train.dtypes[x_train.dtypes == object].index.values:
    x_train[c] = (x_train[c] == True)
del df_train; gc.collect()
# First 80,000 rows train, the remainder validates.
split = 80000
x_train, y_train, x_valid, y_valid = x_train[:split], y_train[:split], x_train[split:], y_train[split:]
x_train = x_train.values.astype(np.float32, copy=False)
x_valid = x_valid.values.astype(np.float32, copy=False)
d_train = lgb.Dataset(x_train, label=y_train)
d_valid = lgb.Dataset(x_valid, label=y_valid)
# Same hyper-parameters as the full-feature LightGBM run above.
params = {
    'max_bin' : 20,
    'learning_rate' : 0.0021, # shrinkage_rate
    'boosting_type' : 'gbdt',
    'objective' : 'regression',
    'metric' : 'l1', # or 'mae'
    'sub_feature' : 0.5, # feature_fraction
    'bagging_fraction': 0.85, # sub_row
    'bagging_freq' : 40,
    'num_leaves' : 512, # num_leaf
    'min_data' : 500, # min_data_in_leaf
    'min_hessian' : 0.05, # min_sum_hessian_in_leaf
}
watchlist = [d_valid]
clf = lgb.train(params, d_train, 500, watchlist)
del d_train, d_valid; gc.collect()
del x_train; gc.collect() #, x_valid
print("Prepare for the prediction ...")
sample = pd.read_csv('sample_submission.csv')
sample['parcelid'] = sample['ParcelId']
df_test = sample.merge(prop, on='parcelid', how='left')
del sample, prop; gc.collect()
x_test = df_test[train_columns]
del df_test; gc.collect()
for c in x_test.dtypes[x_test.dtypes == object].index.values:
    x_test[c] = (x_test[c] == True)
x_test = x_test.values.astype(np.float32, copy=False)
print('Predicting on validation set ...')
pred = clf.predict(x_valid)
print('Validation set MAE: '+ str(mean_absolute_error(y_valid, pred)))
print("Start prediction ...")
# num_threads > 1 will predict very slow in kernel
clf.reset_parameter({"num_threads":1})
p_test = clf.predict(x_test)
# Shrink predictions toward 0.011 (presumably the historical mean logerror
# — TODO confirm).
p_test = 0.97*p_test + 0.03*0.011
del x_test; gc.collect()
print("Start write result ...")
sub = pd.read_csv('sample_submission.csv')
for c in sub.columns[sub.columns != 'ParcelId']:
    sub[c] = p_test
sub.to_csv('lgb_starter_f.csv', index=False, float_format='%.4f')